Given a bank customer's profile, build a neural-network-based classifier that can determine whether the customer will leave the bank within the next 6 months.
https://github.com/GreatLearningAIML1/gl-pgp-aiml-uta-intl-may20-ssetty3.git
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import itertools
from scipy.stats import norm
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Load the dataset and take a first look at its structure.
# FIX: removed the stray spaces in `pd. read_csv (` and wrapped the bare
# expressions (`df.head()`, `df.describe()`) in print() — outside a notebook
# they evaluate to a DataFrame and display nothing.
df = pd.read_csv('bank.csv')
print(df.head())
df.info()  # info() writes directly to stdout, no print needed
print(df.describe())

# Check missing values per column (expected: none for this dataset).
print('The missing values are >>> \n', df.isnull().sum())
# pandas_profiling
# Generate an automated exploratory-data-analysis report for the dataframe.
# NOTE(review): `pandas_profiling` was renamed to `ydata_profiling` in newer
# releases — confirm which package name the environment provides.
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title=" Bank Customer Pandas Profiling Report")
# Persist the full report as a standalone HTML file.
profile.to_file("Bank Customer Pandas Profiling Report.html")
# Bare `profile` renders the report inline when executed in a notebook cell.
profile
# Drop pure-identifier columns (RowNumber, CustomerId, Surname): they carry
# no predictive signal for churn.
df1 = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
df1.head()

# Class balance of the target: how many customers exited vs. stayed.
# FIX: pass the column via the `x=` keyword — the positional data argument
# to countplot is deprecated/removed in recent seaborn releases.
sns.countplot(x='Exited', data=df1)

# Review how churn ('Exited') relates to the categorical variables.
fig, axarr = plt.subplots(2, 2, figsize=(20, 12))
sns.countplot(x='Geography', hue='Exited', data=df1, ax=axarr[0][0])
sns.countplot(x='Gender', hue='Exited', data=df1, ax=axarr[0][1])
sns.countplot(x='HasCrCard', hue='Exited', data=df1, ax=axarr[1][0])
sns.countplot(x='IsActiveMember', hue='Exited', data=df1, ax=axarr[1][1])
Observation: The proportion of female customers leaving is greater than that of male customers. With respect to geography, most of the customers are from France and Spain. The majority of the customers who leave are those with credit cards, and inactive members show a greater churn rate.
# Select the feature matrix (x) and the target (y).
# FIX: select from df1 (identifier columns already dropped) instead of df,
# for consistency with the cleaning step above — the original only worked
# because the columns were listed explicitly.
x = df1[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
         'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']]
y = df1[['Exited']]
x.head()
y.head()
y.info()
# Geography and Gender are object (string) dtype columns — they must be
# encoded to numbers before being fed to the network (next step).
# deal with Object data --> encode them
from sklearn.preprocessing import LabelEncoder

# FIX: work on an explicit copy so the .iloc assignments below do not write
# into a slice of the original dataframe (SettingWithCopyWarning).
x = x.copy()

# Encode 'Geography' (column index 1) as integers.
labelencoder_x = LabelEncoder()
x.iloc[:, 1] = labelencoder_x.fit_transform(x.iloc[:, 1])

# Encode 'Gender' (column index 2) as integers.
# NOTE(review): LabelEncoder imposes an arbitrary ordinal order; for the
# 3-category 'Geography' column one-hot encoding is usually preferable —
# kept as-is here to preserve the downstream feature count.
labelencoder_x_2 = LabelEncoder()
x.iloc[:, 2] = labelencoder_x_2.fit_transform(x.iloc[:, 2])
x.head()
# Pairwise relationships between all features: KDE curves on the diagonal,
# with extra density contours added to the lower triangle.
pair_grid = sns.pairplot(x, diag_kind="kde")
pair_grid.map_lower(sns.kdeplot, levels=4, color=".2")

# Feature-correlation heatmap on a large canvas.
plt.figure(figsize=(20, 20))
correlations = x.corr()
sns.heatmap(correlations, annot=True, linewidths=0.2, annot_kws={'size': 11})
# Split the data into train and test sets (80/20), then scale the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=1)

# MinMaxScaler: fit on the training data only, then apply the same learned
# transform to the test data so no test-set information leaks into training.
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout  # to add layers

# BUG FIX: the original code called classifier.add(...) without ever creating
# the model, which raises NameError. Instantiate the Sequential model first.
classifier = Sequential()

# Input layer (via input_shape) plus the first hidden layer of 6 ReLU units.
classifier.add(Dense(6, activation='relu', input_shape=(X_train.shape[1], )))
classifier.add(Dropout(rate=0.1))  # regularization: drop 10% of units

# Second hidden layer.
classifier.add(Dense(6, activation='relu'))
classifier.add(Dropout(rate=0.1))

# Output layer: a single sigmoid unit — we want a churn probability.
classifier.add(Dense(1, activation='sigmoid'))
classifier.summary()

# Compile the model --> backpropagation -> gradient descent.
# Binary cross-entropy matches the single-probability sigmoid output.
classifier.compile(optimizer='adam', loss="binary_crossentropy",
                   metrics=['accuracy'])

# Fit the network on the training set; hold out 10% of it for validation.
history = classifier.fit(X_train, y_train, batch_size=32, epochs=200,
                         validation_split=0.1, verbose=2)
# Predict churn probabilities on the held-out test set, then threshold them
# at 0.5 into hard True/False class labels.
probabilities = classifier.predict(X_test)
y_pred = probabilities > 0.5

# Summarize performance with a confusion matrix, both printed and plotted.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('< ---- Confusion Matrix ---- >')
sns.heatmap(cm, annot=True, fmt='g')

# Overall accuracy: correct predictions (the diagonal) over all test samples.
correct = cm[0][0] + cm[1][1]
print(correct * 100 / len(y_test), '% of testing data was classified correctly')